1 Executive Summary

  1. Predict the number of wins in a season
  2. Predict game outcomes (win / loss) from game statistics

2 Introduction

3 Loading and Exploring Data

3.1 Loading required libraries

library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
# Global knitr chunk options: echo the code, cache chunk results, and tidy
# the displayed source to a 60-character width.
opts_chunk$set(
    echo = TRUE,
    cache = TRUE,
    tidy = TRUE,
    tidy.opts = list(width.cutoff = 60)
)

3.3 Data Size and structure

# Row and column counts of the per-player box-score table.
dim(games_detail_df)
## [1] 645953     29
# Column names, types and leading values of the same table.
str(games_detail_df)
## 'data.frame':    645953 obs. of  29 variables:
##  $ GAME_ID          : int  22101005 22101005 22101005 22101005 22101005 22101005 22101005 22101005 22101005 22101005 ...
##  $ TEAM_ID          : int  1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 ...
##  $ TEAM_ABBREVIATION: chr  "MIN" "MIN" "MIN" "MIN" ...
##  $ TEAM_CITY        : chr  "Minnesota" "Minnesota" "Minnesota" "Minnesota" ...
##  $ PLAYER_ID        : int  1630162 1630183 1626157 1627736 1626156 1629675 1629162 1629669 1627752 1629006 ...
##  $ PLAYER_NAME      : chr  "Anthony Edwards" "Jaden McDaniels" "Karl-Anthony Towns" "Malik Beasley" ...
##  $ NICKNAME         : chr  "Anthony" "Jaden" "Karl-Anthony" "Malik" ...
##  $ START_POSITION   : chr  "F" "F" "C" "G" ...
##  $ COMMENT          : chr  "" "" "" "" ...
##  $ MIN              : chr  "36:22" "23:54" "25:17" "30:52" ...
##  $ FGM              : num  4 6 4 4 3 3 2 6 3 0 ...
##  $ FGA              : num  10 8 9 9 13 8 5 13 8 0 ...
##  $ FG_PCT           : num  0.4 0.75 0.444 0.444 0.231 0.375 0.4 0.462 0.375 0 ...
##  $ FG3M             : num  3 1 1 4 1 1 0 2 2 0 ...
##  $ FG3A             : num  8 3 3 9 6 2 1 5 5 0 ...
##  $ FG3_PCT          : num  0.375 0.333 0.333 0.444 0.167 0.5 0 0.4 0.4 0 ...
##  $ FTM              : num  4 1 6 0 7 4 1 2 3 0 ...
##  $ FTA              : num  4 1 8 0 7 4 1 2 5 0 ...
##  $ FT_PCT           : num  1 1 0.75 0 1 1 1 1 0.6 0 ...
##  $ OREB             : num  0 2 1 0 0 3 0 0 0 0 ...
##  $ DREB             : num  8 4 9 3 6 7 1 0 2 0 ...
##  $ REB              : num  8 6 10 3 6 10 1 0 2 0 ...
##  $ AST              : num  5 0 0 1 9 1 3 1 1 0 ...
##  $ STL              : num  3 0 0 1 1 3 3 0 1 0 ...
##  $ BLK              : num  1 2 0 0 0 2 0 0 0 0 ...
##  $ TO               : num  1 2 3 1 5 1 0 0 1 0 ...
##  $ PF               : num  1 6 4 4 0 1 1 0 2 0 ...
##  $ PTS              : num  15 14 15 12 14 11 5 16 11 0 ...
##  $ PLUS_MINUS       : num  5 10 14 20 17 -7 -10 -5 1 0 ...
# Row and column counts of the per-game table (home and away team totals).
dim(games_df)
## [1] 25796    21
# Column names, types and leading values of the same table.
str(games_df)
## 'data.frame':    25796 obs. of  21 variables:
##  $ GAME_DATE_EST   : chr  "2022-03-12" "2022-03-12" "2022-03-12" "2022-03-12" ...
##  $ GAME_ID         : int  22101005 22101006 22101007 22101008 22101009 22101010 22101011 22100995 22100996 22100997 ...
##  $ GAME_STATUS_TEXT: chr  "Final" "Final" "Final" "Final" ...
##  $ HOME_TEAM_ID    : int  1610612748 1610612741 1610612759 1610612744 1610612743 1610612762 1610612757 1610612753 1610612737 1610612738 ...
##  $ VISITOR_TEAM_ID : int  1610612750 1610612739 1610612754 1610612749 1610612761 1610612758 1610612764 1610612750 1610612746 1610612765 ...
##  $ SEASON          : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
##  $ TEAM_ID_home    : int  1610612748 1610612741 1610612759 1610612744 1610612743 1610612762 1610612757 1610612753 1610612737 1610612738 ...
##  $ PTS_home        : num  104 101 108 122 115 134 127 118 112 114 ...
##  $ FG_PCT_home     : num  0.398 0.443 0.412 0.484 0.551 0.558 0.516 0.465 0.478 0.467 ...
##  $ FT_PCT_home     : num  0.76 0.933 0.813 0.933 0.75 0.71 0.909 0.88 0.895 0.8 ...
##  $ FG3_PCT_home    : num  0.333 0.429 0.324 0.4 0.407 0.39 0.367 0.4 0.29 0.188 ...
##  $ AST_home        : num  23 20 28 33 32 21 21 31 28 23 ...
##  $ REB_home        : num  53 46 52 55 39 44 43 49 47 47 ...
##  $ TEAM_ID_away    : int  1610612750 1610612739 1610612754 1610612749 1610612761 1610612758 1610612764 1610612750 1610612746 1610612765 ...
##  $ PTS_away        : num  113 91 119 109 127 125 118 110 106 103 ...
##  $ FG_PCT_away     : num  0.422 0.419 0.489 0.413 0.471 0.5 0.47 0.456 0.488 0.422 ...
##  $ FT_PCT_away     : num  0.875 0.824 1 0.696 0.76 0.857 0.963 1 0.824 0.958 ...
##  $ FG3_PCT_away    : num  0.357 0.208 0.389 0.386 0.387 0.394 0.412 0.333 0.375 0.294 ...
##  $ AST_away        : num  21 19 23 27 28 27 26 24 22 21 ...
##  $ REB_away        : num  46 40 47 39 50 33 35 37 36 42 ...
##  $ HOME_TEAM_WINS  : int  0 1 0 1 0 1 1 1 1 1 ...

4 Exploring and Understanding the Data Set

4.1 Finding the types of games recorded

# Number of distinct games recorded for Golden State (abbreviation "GSW"),
# as either home or away team, in the rows labelled SEASON == 2019.
# Wrapped in local() so the helper variables do not leak into the workspace.
local({
    gsw_id <- teams_df$TEAM_ID[which(teams_df$ABBREVIATION == "GSW")]
    gsw_is_home <- games_df$TEAM_ID_home == gsw_id
    gsw_is_away <- games_df$TEAM_ID_away == gsw_id
    in_season <- games_df$SEASON == 2019
    gsw_game_ids <- games_df$GAME_ID[(gsw_is_home | gsw_is_away) & in_season]
    length(unique(gsw_game_ids))
})
## [1] 70

According to the data set, the Golden State Warriors played 70 games in the 2019-2020 season, whereas in fact they played only 65. The data set therefore appears to record every game, including preseason and playoff games.

# Work on a copy of the per-game table and derive calendar features from the
# game date, so games can be located within the year.
temp <- games_df
# Parse to POSIXct rather than POSIXlt: POSIXlt is a list-based class that is
# not intended to be stored as a data-frame column (dplyr verbs may reject
# it). `format` is passed by name so it cannot be mistaken for `tz`.
temp$GAME_DATE_EST <- as.POSIXct(temp$GAME_DATE_EST, format = "%Y-%m-%d",
    tz = "EST")
temp$Month_EST <- month(temp$GAME_DATE_EST)
temp$DAY_EST <- day(temp$GAME_DATE_EST)
temp$YEAR_EST <- year(temp$GAME_DATE_EST)
# NOTE(review): lubridate's day() and mday() both give the day of the month,
# so MONTH_DAY duplicates DAY_EST; both are kept in case later code uses them.
temp$MONTH_DAY <- mday(temp$GAME_DATE_EST)
temp$DAYOFYEAR <- yday(temp$GAME_DATE_EST)
# Sanity check: day-of-year values should lie within 1..366.
range(temp$DAYOFYEAR)
## [1]   1 366

Distribution of games throughout the year (ignoring the 2021-2022 season, as it is incomplete)

# Histogram of games by day of year, one bin per day. Rows with
# SEASON == 2021 are dropped because that season is incomplete, matching the
# filter used for the interactive plotly histogram of the same data.
ggplot(temp[temp$SEASON != 2021, ], aes(x = DAYOFYEAR)) +
    geom_histogram(binwidth = 1)

# Interactive histogram of games by day of year, excluding rows with
# SEASON == 2021 (incomplete season). The values are passed through `data`
# and referenced as ~DAYOFYEAR so the x-axis title is the column name rather
# than the full subsetting expression; `!=` replaces the harder-to-read
# `!x == y` form (same result, since `==` binds tighter than `!`).
fig1 <- plot_ly(data = data.frame(DAYOFYEAR = temp$DAYOFYEAR[temp$SEASON != 2021]),
    x = ~DAYOFYEAR, type = "histogram", nbinsx = 366)
fig1